First, load in the necessary packages:
# Packages ----------------------------------------------------------------
#Bootstrap pacman: install only if genuinely absent. requireNamespace()
#probes without attaching; library() then errors loudly if the install
#failed (the original require()-then-library() attached pacman twice and
#would only have returned FALSE on failure).
if(!requireNamespace('pacman', quietly = TRUE)){
  install.packages('pacman')
}
library('pacman')
#p_load() installs-and-attaches anything missing. Duplicates removed
#(reshape2, ggplot2, dplyr, plotly were each listed twice); colorspace
#added because rainbow_hcl() is called below but was never loaded.
pacman::p_load(ggmap, sca, plotly, EnvStats, ggplot2, GGally, corrplot, dplyr,
               tidyr, stringr, reshape2, cowplot, ggpubr, rmarkdown,
               rstudioapi, wesanderson, RColorBrewer, colorspace)
Load in the dataset and replace the missing data with the mean values
# Load Dataset ------------------------------------------------------------
#Read the Titanic training set; name the header/separator arguments
#explicitly rather than relying on positional matching.
train.df <- read.csv(file = "titanictrain.csv", header = TRUE, sep = ",")
##Impute missing ages with the mean of the observed ages
mean.age <- mean(train.df$Age, na.rm = TRUE)
train.df$Age[is.na(train.df$Age)] <- mean.age
It can be useful to visualise the data before creating a model
Using base-package plots, the following visualisations can be created:
#Colour palette with three entries (only the first two are used below)
#NOTE(review): rainbow_hcl() comes from the colorspace package, which is not
#in the p_load() list above - confirm it gets attached via a dependency
PlotCol.vec <- rainbow_hcl(3)
#Scatter of Age against the (still numeric) Pclass; Survived is 0/1 here,
#so Survived + 1 indexes palette entries 1 and 2
plot(x = train.df$Pclass, y = train.df$Age, col = PlotCol.vec[train.df$Survived + 1], pch = 16, cex = 2)
To plot with factors we will first need to create categorical variables
###Convert class and survival to factors so plot() dispatches to box plots
train.df$Pclass <- as.factor(train.df$Pclass)
train.df$Survived <- as.factor(train.df$Survived)
###Create the Plot (Box Plot)
PlotCol.vec <- rainbow_hcl(2)
#With a factor x, plot() draws one box per class. `data = train.df` is not
#a plot() argument (it was silently swallowed by `...`), so it is dropped.
plot(x = train.df$Pclass, y = train.df$Age,
     col = PlotCol.vec[train.df$Survived], pch = 16,
     xlab = "Passenger Class", ylab = "Passenger Age",
     main = "Titanic Survivors")
ggplot2 can create prettier plots:
###Create Categorical Factors (a no-op if already converted above)
train.df$Pclass <- as.factor(train.df$Pclass)
train.df$Survived <- as.factor(train.df$Survived)
#Age vs class, coloured by fate. The factor levels are "0" then "1" and in
#the Titanic data 0 = perished, 1 = survived, so the legend labels must be
#given in that order (the original had them swapped). `size` is the point
#aesthetic; `lwd` is a line width and did not control point size.
ggplot(train.df, aes(x = Pclass, y = Age, col = Survived)) +
  geom_point(size = 4) +
  labs(title = "Titanic Survivors", x = "Passenger Class", y = "Passenger Age") +
  scale_color_manual(name = "Passenger Fate",
                     labels = c("Perished", "Survived"),
                     values = rainbow_hcl(2))
First we will create various models to predict survival using a logistic regression model
#Candidate logistic-regression models for survival
##Two predictors: age and passenger class
twovar.mod <- glm(Survived ~ Age + Pclass, family = binomial(link = "logit"),
data = train.df)
##Adds sex and a child indicator (Age > 13) on top of age and class
fourvar.mod <- glm(Survived ~ I(Age>13) + Age + Pclass + Sex,
family = binomial(link = "logit"), data = train.df)
##Quadratic age term plus class and parents/children-aboard count
quad.mod <- glm(Survived ~ I(Age^2) + Pclass + Parch,
family = binomial(link = "logit"), data = train.df)
Next we will favour the model with the lowest AIC value (although we could go through the process of using Training and Validation errors but that’s a lot of effort):
##Select the lowest AIC
#Collect the candidates in a list so the AIC extraction and the model
#selection both index the same structure.
candidate.mods <- list(twovar.mod, fourvar.mod, quad.mod)
ModAIC.vec <- vapply(candidate.mods, function(m) summary(m)$aic, numeric(1))
#The best model is simply the list element with the smallest AIC
mod <- candidate.mods[[which.min(ModAIC.vec)]]
print("Best fit: ") ; print(mod$call)
[1] "Best fit: "
glm(formula = Survived ~ I(Age > 13) + Age + Pclass + Sex, family = binomial(link = "logit"),
data = train.df)
#In-sample survival probabilities from the selected model
pred <- predict(mod, type = 'response')
train.df$SurvivalProb <- pred
#Move the newly appended SurvivalProb column (13) next to Survived
#NOTE(review): assumes train.df had exactly 12 columns before - confirm
train.df <- train.df[,c(1, 2,13,3:12)]
Different probability thresholds will give different prediction values (as classified from the probability); this in turn gives different rates of false positives and true positives for the various threshold values. Note that Ground Truth refers to the observed value (e.g. survived/perished).
A ROC curve is a plot of the TruePos rate against the FalsePos rate, it is used to determine the best threshold value for making a prediction from a probability.
The sensitivity is the same as the True Positive Rate:
#True positive rate: of the observations that are truly positive (== 1),
#what share did the threshold rule classify as positive?
sensitivity <- function(probability, observation, threshold){
  #Probabilities at or above the threshold become positive calls
  predicted <- ifelse(probability < threshold, 0, 1)
  #Hits: cases predicted positive that were observed positive
  hits <- sum((predicted == 1) * (observation == 1))
  positives <- sum(observation == 1)
  hits/positives
}
The Specificity is the same as the True Negative Rate
#Specificity (true negative rate): of the observations that are truly
#negative (== 0), what share was classified negative at `threshold`?
specificity <- function(probability, observation, threshold){
  PredObs <- ifelse(probability < threshold, 0, 1)
  #specificity is the rate of true negatives: (no. of True Neg)/(no. of obs neg)
  #(the original comments were copy-pasted from sensitivity and wrong)
  TrueNegPred.n <- sum(as.numeric(PredObs == 0) * as.numeric(observation == 0))
  #No. of observed negatives (renamed: ObsPos.n was a copy-paste slip,
  #it actually counts negatives)
  ObsNeg.n <- sum(observation == 0)
  return(TrueNegPred.n/ObsNeg.n)
} #observation and ground_truth are synonymous
This is the same as sensitivity, but for the sake of clarity:
#Alias: the true positive rate IS the sensitivity
tpr <- sensitivity
This is equivalent to (1 - specificity), so we will use the specificity function to define this one.
#False positive rate = 1 - true negative rate (specificity)
fpr <- function(probability, observation, threshold){
  1 - specificity(probability, observation, threshold)
}
This is the measurement of the accuracy of the predictions:
#Accuracy: share of ALL observations classified correctly at `threshold`.
#The original divided by nrow(train.df), a global - wrong whenever the
#function is applied to any other dataset; the input length is used instead.
acc <- function(probability, observation, threshold){
  PredObs <- ifelse(probability < threshold, 0, 1)
  TrueNegPred.n <- sum(as.numeric(PredObs == 0) * as.numeric(observation == 0))
  TruePosPred.n <- sum(as.numeric(PredObs == 1) * as.numeric(observation == 1))
  #Total population = number of observations actually passed in
  totalpop <- length(observation)
  return((TrueNegPred.n+TruePosPred.n)/totalpop)
}
This is a measurement of how correct the predictions are:
#Correct-prediction score: sensitivity plus specificity at this threshold
#(a Youden-style criterion, maximised by the best separating threshold)
cpredr <- function(probability, observation, threshold){
  specificity(probability, observation, threshold) +
    sensitivity(probability, observation, threshold)
}
For every different Probability Threshold there are differing rates of FalsePos and TruePos (and hence also sensitivity, specificity, TrueNegRate, FalseNegRate, Accuracy etc.).
This can be expressed in a dataframe, which will be created using a loop:
##Compute ROC values
#For each candidate threshold, record the false/true positive rates, the
#correct-prediction rate (sens + spec) and the accuracy on the training set.
###Create various threshold values
steps <- 10^3
threshold <- seq(0, 1, length.out = steps)
###Preallocate the dataframe with exactly `steps` rows and 5 columns.
###(The original grew a 1-row frame by `steps` extra rows, leaving a
###trailing all-NA row that the loop never filled; the Threshold column
###can also be filled vectorized, outside the loop.)
roc.df <- data.frame("Threshold" = threshold,
                     "FalsePosR" = numeric(steps),
                     "TruePosR" = numeric(steps),
                     "CorrectPredRate" = numeric(steps),
                     "Accuracy" = numeric(steps))
###Fill in the rate columns row by row
for(i in seq_len(steps)){
  roc.df[i, "FalsePosR"] <- fpr(train.df$SurvivalProb, train.df$Survived, threshold[i])
  roc.df[i, "TruePosR"] <- tpr(train.df$SurvivalProb, train.df$Survived, threshold[i])
  roc.df[i, "CorrectPredRate"] <- cpredr(train.df$SurvivalProb, train.df$Survived, threshold[i])
  roc.df[i, "Accuracy"] <- acc(train.df$SurvivalProb, train.df$Survived, threshold[i])
}
###Inspect the ROC Values
head(roc.df)
There are a few ways to use these values to decide on a probability threshold. For example, taking the threshold corresponding to the maximum value of any of the following would be a reasonable choice:
In this case we will use Accuracy, but the lecture notes use (specificity + Sensitivity), in this case it doesn’t matter because the threshold values are the same.
###Select the best Threshold value
#Row maximising Accuracy
best.roc.acc <- roc.df[which.max(roc.df$Accuracy), ]
#Row maximising specificity + sensitivity (the CorrectPredRate column).
#The original recomputed which.max(Accuracy) here, so the comparison in
#the narrative was vacuous; the two criteria happen to agree on this data
#but are distinct quantities.
best.roc.spsaddsns <- roc.df[which.max(roc.df$CorrectPredRate), ]
#Stack the two candidate rows for side-by-side inspection
best.roc.df <- rbind(best.roc.spsaddsns, best.roc.acc)
rownames(best.roc.df) <- NULL
best.roc.df
NA
#Three-panel diagnostic: ROC curve (left, spanning both rows), accuracy
#against TPR and against FPR (right column)
cols.vec <- rainbow_hcl(3)
layout(matrix(nrow = 2, data = c(1, 2, 1, 3)))
#Index of the most accurate threshold, starred on every panel
i <- which.max(roc.df$Accuracy)
#Panel 1: the ROC curve itself
plot(x = roc.df$FalsePosR, y = roc.df$TruePosR, type = 'l',
     xlab = "False Positive Rate", ylab = "True Positive Rate",
     main = "Roc Curve", col = cols.vec[1], lwd = 2)
points(roc.df$FalsePosR[i], roc.df$TruePosR[i], pch = 8)
#Panel 2: accuracy as a function of the true positive rate
plot(x = roc.df$TruePosR, y = roc.df$Accuracy, type = 'l',
     xlab = "True Positive Rate", ylab = "Accuracy",
     main = "Accuracy~TPR", col = cols.vec[2], lwd = 2)
points(x = roc.df$TruePosR[i], y = roc.df$Accuracy[i], pch = 8)
#Panel 3: accuracy as a function of the false positive rate
plot(x = roc.df$FalsePosR, y = roc.df$Accuracy, type = 'l',
     xlab = "False Positive Rate", ylab = "Accuracy",
     main = "Accuracy~FPR", col = cols.vec[3], lwd = 2)
points(x = roc.df$FalsePosR[i], y = roc.df$Accuracy[i], pch = 8)
NA
#Accuracy as a function of the probability threshold; the best threshold
#(maximum accuracy) is marked with a star
i <- which.max(roc.df$Accuracy)
plot(x = roc.df$Threshold, y = roc.df$Accuracy, type = 'l',
     xlab = "Probability Threshold", ylab = "Accuracy",
     main = "Accuracy vs Prob. Threshold", col = cols.vec[1], lwd = 2)
points(roc.df$Threshold[i], roc.df$Accuracy[i], pch = 8)
NA
##ggplot
#ROC curve coloured by accuracy, with the best-accuracy point highlighted.
#The original called labs() as a free-standing statement (missing `+`), so
#the labels were printed to the console instead of applied to the plot;
#it is now chained onto the plot. annotate() is used for the fixed point
#rather than aes() with `roc.df$...[i]`, which fought the data mapping.
ggplot(roc.df, aes(x = FalsePosR, y = TruePosR)) +
  geom_line(aes(col = Accuracy)) +
  annotate("point", x = roc.df$FalsePosR[i], y = roc.df$TruePosR[i],
           col = "IndianRed", size = 6) +
  annotate("text", x = roc.df$FalsePosR[i]+0.2,
           y = roc.df$TruePosR[i],
           label = paste("Threshold = ",
                         round(roc.df$Threshold[i], 3))) +
  labs(x = "False Positive Rate",
       y = "True Positive Rate", title = "ROC Curve")
$x
[1] "False Positive Rate"
$y
[1] "True Positive Rate"
$title
[1] "ROC Curve"
attr(,"class")
[1] "labels"
A 3d visualisation really drives the point home:
##Plotly
##Plotly
#Axis titles for the 3d scatter, named up front for readability
roc.axes <- list(xaxis = list(title = 'False Positive Rate'),
                 yaxis = list(title = 'True Positive Rate'),
                 zaxis = list(title = 'Accuracy ((TPR+TNR)/TotalPop)'))
#3d view: FPR x TPR x Accuracy, points coloured by threshold
obs.plotly <- roc.df %>%
  plot_ly(x = ~FalsePosR, y = ~TruePosR, z = ~Accuracy, color = ~Threshold) %>%
  add_markers() %>%
  layout(title = "Observed Values", scene = roc.axes)
obs.plotly
Ignoring 1 observationsIgnoring 1 observations
#Load the evaluation data
#NOTE(review): this reads the *training* csv - a separate "titanictest.csv"
#was probably intended; confirm which file should be scored
test.df <- read.csv(file = "titanictrain.csv", header = TRUE, sep = ",")
#Make as Factors - use test.df's own columns. The original indexed
#train.df here, which silently misaligns (or recycles) values whenever
#the two files differ in content or length.
test.df$Pclass <- as.factor(test.df$Pclass)
test.df$Survived <- as.factor(test.df$Survived)
Use the model to find the probabilities corresponding to the survival rate in the test data:
#Score the test set with the selected model. NOTE(review): the `prob`
#column is printed below but its assignment was missing from the visible
#source (likely a hidden chunk) - reconstructed here; confirm against the
#original Rmd.
test.df$prob <- predict(mod, newdata = test.df, type = "response")
test.df$prob
[1] 0.09503375 0.90624480 0.56692485 0.91205172 0.07186358 NA 0.32668982 0.25676618 0.56116061 0.85070164 0.81861912 0.85812289 0.09914312 0.06585500 0.63427878
[16] 0.68546464 0.25676618 NA 0.53795346 NA 0.20279575 0.20661199 0.62882408 0.47160363 0.80427845 0.49700347 NA 0.52429796 NA NA
[31] 0.40251193 NA NA 0.10952446 0.47160363 0.39128995 NA 0.09706903 0.61226664 0.63427878 0.48528678 0.80773683 NA 0.93819416 0.60668721
[46] NA NA NA NA 0.61226664 0.23504251 0.09706903 0.88192243 0.80035068 0.27268252 NA 0.82864040 0.08271348 0.93541907 0.21860384
[61] 0.09503375 0.90624480 0.37467357 0.24792152 NA NA 0.80035068 0.10125656 0.61781678 0.08727005 0.21440306 0.62333634 0.26100989 0.08727005 0.07669779
[76] 0.08915543 NA NA 0.53844387 0.54377470 0.09503375 0.08182853 NA 0.47160363 0.84154773 0.52628146 0.10783829 NA 0.93215349 0.09107749
[91] 0.08182853 0.09914312 0.36919764 0.08727005 0.04224878 NA 0.24569502 0.50087530 0.78096152 0.20661199 0.55537982 NA 0.51259354 0.07505414 0.06879857
[106] 0.08360713 0.59544578 NA 0.06731186 NA 0.36375520 0.63155564 0.09503375 0.60107981 0.61781678 0.09706903 0.03259074 0.22648599 0.49501501 0.82547680
[121] 0.26100989 NA 0.21243547 0.78691703 0.32668982 0.21462602 NA 0.09107749 NA 0.05771301 0.07505414 0.09914312 0.44448913 0.80035068 0.24333280
[136] 0.25206863 0.93784869 0.41953461 0.10783829 0.49501501 NA 0.58978647 0.57839767 0.10125656 0.27480062 0.27015376 0.08542079 0.80056203 0.19717038 0.17755455
[151] 0.14880803 0.93362109 0.04569623 0.06372460 NA 0.34234319 0.62333634 0.08008446 NA NA 0.05900113 0.75595470 0.08727005 0.10560361 0.26126520
[166] 0.22671788 NA 0.45609365 NA 0.08360713 0.29166988 0.24792152 0.82882821 0.09706903 0.31646211 0.10340988 NA 0.87945938 0.22240556 0.07031564
[181] NA NA 0.22671788 0.53745333 0.81861912 NA NA 0.37467357 0.06442750 0.07031564 0.78887577 0.27015376 0.60668721 0.52577983 0.89359227
[196] 0.85812289 NA 0.06165856 NA 0.81842257 0.08360713 NA 0.07344291 0.05707890 0.10340988 0.82547680 0.07669779 0.08727005 0.62333634 0.40251193
[211] 0.09107749 0.77692515 0.09503375 0.22240556 NA 0.91928821 0.56116061 0.17755455 0.91753170 0.22240556 0.10783829 0.23480468 0.05052322 NA 0.41383694
[226] 0.09503375 0.27015376 0.09810119 0.27480062 NA 0.91205172 0.08182853 0.12658298 0.81511241 0.24767488 NA 0.17081136 0.93103836 0.27015376 0.21048109
[241] NA NA 0.22648599 0.09503375 0.08008446 0.38018175 0.57267103 0.81842257 0.41953461 0.14011851 NA 0.54958401 0.28685059 0.08008446 0.47943369
[256] 0.54958401 NA 0.92101052 0.91205172 0.71016993 NA 0.25231818 0.33708507 0.40251193 NA 0.19903232 0.10783829 0.08915543 0.85812289 0.91205172
[271] NA 0.08915543 0.75160409 0.41953461 NA 0.84324547 0.45609365 NA 0.23504251 0.51458066 0.03691003 0.08360713 0.10783829 0.10125656 NA
[286] 0.07505414 0.08008446 0.09503375 0.17755455 0.58978647 0.92756841 0.93784869 0.19903232 0.57839767 0.09107749 NA 0.09205244 0.97884531 NA 0.87945938
[301] NA NA 0.10125656 NA NA 0.77609983 NA 0.94052602 0.22240556 0.92101052 0.93065584 0.93920113 0.81135106 0.08360713 0.17415720
[316] 0.56692485 0.81842257 0.14011851 0.91928821 0.90218474 0.09503375 0.08542079 0.79657854 0.82528608 NA 0.91015312 0.04039188 0.77283604 0.53795346 0.94182382
[331] NA 0.37193150 0.41383694 0.10783829 NA NA 0.46576623 0.90009646 0.05771301 0.37467357 0.53162091 0.93065584 0.23061907 0.24333280 0.19903232
[346] 0.81842257 0.75595470 NA 0.25231818 0.06165856 0.09303675 NA 0.11011442 0.08915543 NA 0.08360713 0.93362109 0.76450002 NA NA
[361] 0.06442750 0.22648599 0.45609365 0.07186358 NA 0.08008446 0.85231851 NA NA 0.93065584 0.48915610 0.10340988 0.10125656 0.50673535 0.82207385
[376] NA 0.58978647 0.47744880 0.09914312
Use the probabilities to create predictions:
#Assess the Model Accuracy
#NOTE(review): test.df$pred is used below but was never created in the
#visible source - reconstructed here by classifying at the best-accuracy
#threshold from the ROC table; confirm against the original Rmd.
best.threshold <- roc.df$Threshold[which.max(roc.df$Accuracy)]
test.df$pred <- ifelse(test.df$prob < best.threshold, 0, 1)
#NA comparisons (rows with missing predictors) are dropped before averaging
rate <- mean(na.omit(test.df$pred == test.df$Survived))
paste("Thus the model created from training data predicts survival on the test set at a success rate of", signif(rate*100, 3), "%") %>% print()
[1] "Thus the model created from training data predicts survival on the test set at a success rate of 80.7 %"